Please fill out:



OBTAIN: Import data, inspect, check for datatypes to convert and null values
SCRUB: cast data types, identify outliers, check for multicollinearity, normalize data
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import numpy as np
%matplotlib inline
inline_rc = dict(mpl.rcParams)
plt.style.use('seaborn')
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import normaltest as normtest # D'Agostino and Pearson's omnibus test
from collections import Counter
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
df = pd.read_csv('kc_house_data.csv')
# Set aside id/lat/long/date in a separate frame for later re-joining.
# BUG FIX: the original `df.set_index=('id')` *assigned* a string to the
# `set_index` attribute (clobbering the method) instead of calling it, and
# `df_dropped.set_index('id')` discarded its return value (no inplace=True).
# Downstream cells concatenate df with df_dropped[['lat','long','id']] on
# axis=1 and rely on both frames keeping the default RangeIndex and an 'id'
# COLUMN, so the correct fix is simply to remove the two dead statements.
drop_me = ['id', 'lat', 'long', 'date']
df_dropped = df[drop_me].copy()
df.drop(drop_me, axis=1, inplace=True)
folder = %pwd
fig_filepath = folder+'/Figures/'
data_filepath = folder+'/Data/'
df.index
# Check columns returns the datatype, null values and unique values of input series
def check_column(series, nlargest='all'):
    """Print a quick profile of *series*: dtype, null counts, and value counts.

    Parameters
    ----------
    series : pd.Series
        Column to inspect (must have a ``name``).
    nlargest : int or 'all', default 'all'
        If an int, show only the ``nlargest`` most frequent values;
        otherwise show the full ``value_counts()``.
    """
    # BUG FIX: the original header f-string had a stray apostrophe and
    # printed e.g. "Column: df['price']':".
    print(f"Column: df['{series.name}']:")
    print(f"dtype: {series.dtype}")
    n = len(series)
    n_na = series.isna().sum()
    print(f"isna: {n_na} out of {n} - {round(n_na / n * 100, 3)}%")
    print('\nUnique non-na values:')
    if nlargest == 'all':
        print(series.value_counts())
    else:
        print(series.value_counts().nlargest(nlargest))
# define log + z-score
def log_z(col):
    """Return the z-scores of the natural log of *col* as a numpy array.

    Uses the population standard deviation (ddof=0), matching
    ``np.sqrt(np.var(...))``.
    """
    logged = np.log(np.asarray(col))
    return (logged - logged.mean()) / logged.std()
def rem_out_z(col_name):
    """Return z-scores of *col_name*, with any |z| > 3 entry replaced by NaN.

    Z-scoring uses the population standard deviation (ddof=0).
    """
    data = np.array(col_name)
    scores = (data - data.mean()) / data.std()
    extreme = np.abs(scores) > 3
    scores[extreme] = np.nan
    return scores
# MULTIPLOT
from string import ascii_letters
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
def multiplot(df):
    """Draw a lower-triangle, annotated correlation heatmap of *df*.

    Computes ``df.corr()``, masks the redundant upper triangle, and renders
    a 16x16 seaborn heatmap with a diverging colormap centered at 0.
    """
    sns.set(style="white")
    # Compute the correlation matrix
    corr = df.corr()
    # Mask for the upper triangle (mirror image of the lower).
    # BUG FIX: `np.bool` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `bool` is the correct dtype here.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(16, 16))
    # Custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, annot=True, cmap=cmap, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5})
# Plots histogram and scatter (vs price) side by side
def plot_hist_scat(df,target='price',stats=False):
# plt.style.use('bmh')
plt.style.use('fivethirtyeight')
# fontTitle = {'family': 'serif',
# 'color': 'black',
# 'weight': 'normal',
# 'size': 16,
# }
# fontTicks = {'family': 'sans-serif',
# 'color': 'black',
# 'weight': 'normal',
# 'size': 10,
# }
results = [['column','K_square','p-val']]
for column in df.describe():
fig = plt.figure(figsize=(8,3) )#plt.figaspect(0.5))#(5,4))
ax1 = fig.add_subplot(121)
ax1.hist(df[column],density=True,label = column+' histogram',bins=20)
ax1.set_title(column.capitalize())
ax1.legend()
ax2 = fig.add_subplot(122)
ax2.scatter(x=df[column], y=df[target],label = column+' vs price',marker='.')
ax2.set_title(column.capitalize())
ax2.legend()
fig.tight_layout()
if stats==True:
stat, p = normtest(df[column])
# print(f'Normality test for {column}:K_square = {stat}, p-value = {p}')
results.append([column,stat, p])
return pd.DataFrame(results)
#SEABORN
import matplotlib.ticker as mtick
import matplotlib.pyplot as plt
# Plots histogram and scatter (vs price) side by side
def plot_hist_scat_sns(df,target='price'):
    """Plot a distplot (hist + KDE) and a regplot vs *target* for each numeric column.

    Iterates over every column appearing in ``df.describe()``; each 2x2 figure
    (the extra bottom row is deleted before layout) is saved as
    '<column>_dist_regr_plots.png' under the module-level global
    ``fig_filepath``, which must be defined earlier in the notebook.

    NOTE(review): ``sns.distplot`` and positional ``sns.regplot(x, y)`` are
    deprecated/removed in modern seaborn -- confirm the installed version.
    """
    plt.style.use('dark_background')
    ## ----------- DEFINE AESTHETIC CUSTOMIZATIONS ----------- ##
    # Axis Label fonts
    fontTitle = {'fontsize': 16,
    'fontweight': 'bold',
    'fontfamily':'serif'}
    fontAxis = {'fontsize': 14,
    'fontweight': 'bold',
    'fontfamily':'serif'}
    fontTicks = {'fontsize': 12,
    'fontweight':'bold',
    'fontfamily':'serif'}
    # Formatting dollar sign labels
    fmtPrice = '${x:,.0f}'
    tickPrice = mtick.StrMethodFormatter(fmtPrice)
    ## ----------- PLOTTING ----------- ##
    ## Loop through dataframe to plot
    for column in df.describe():
        # Create figure with subplots for current column
        # Note: in order to use identical syntax for large # of subplots (ax[i,j]),
        # declare an extra row of subplots to be removed later
        fig, ax = plt.subplots(figsize=(12,10), ncols=2, nrows=2)
        ## ----- SUBPLOT 1 -----##
        i,j = 0,0
        ax[i,j].set_title(column.capitalize(),fontdict=fontTitle)
        # Define graphing keyword dictionaries for distplot (Subplot 1)
        hist_kws = {"linewidth": 1, "alpha": 1, "color": 'blue','edgecolor':'w'}
        kde_kws = {"color": "white", "linewidth": 1, "label": "KDE"}
        # Plot distplot on ax[i,j] using hist_kws and kde_kws
        sns.distplot(df[column], norm_hist=True, kde=True,
        hist_kws = hist_kws, kde_kws = kde_kws,
        label=column+' histogram', ax=ax[i,j])
        # Set x axis label
        ax[i,j].set_xlabel(column.title(),fontdict=fontAxis)
        # Get x-ticks, rotate labels, and return
        xticklab1 = ax[i,j].get_xticklabels(which = 'both')
        ax[i,j].set_xticklabels(labels=xticklab1, fontdict=fontTicks, rotation=45)
        ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter())
        # Set y-label
        ax[i,j].set_ylabel('Density',fontdict=fontAxis)
        yticklab1=ax[i,j].get_yticklabels(which='both')
        ax[i,j].set_yticklabels(labels=yticklab1,fontdict=fontTicks)
        ax[i,j].yaxis.set_major_formatter(mtick.ScalarFormatter())
        # Set y-grid
        ax[i, j].set_axisbelow(True)
        ax[i, j].grid(axis='y',ls='--')
        ## ----- SUBPLOT 2----- ##
        i,j = 0,1
        ax[i,j].set_title(column.capitalize(),fontdict=fontTitle)
        # Define the keyword dictionaries for scatter plot and regression line (subplot 2)
        line_kws={"color":"white","alpha":0.5,"lw":4,"ls":":"}
        scatter_kws={'s': 2, 'alpha': 0.5,'marker':'.','color':'blue'}
        # Plot regplot on ax[i,j] using line_kws and scatter_kws
        sns.regplot(df[column], df[target],
        line_kws = line_kws,
        scatter_kws = scatter_kws,
        ax=ax[i,j])
        # Set x-axis label
        ax[i,j].set_xlabel(column.title(),fontdict=fontAxis)
        # Get x ticks, rotate labels, and return
        xticklab2=ax[i,j].get_xticklabels(which='both')
        ax[i,j].set_xticklabels(labels=xticklab2,fontdict=fontTicks, rotation=45)
        ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter())
        # Set y-axis label
        ax[i,j].set_ylabel('Price',fontdict=fontAxis)
        # Get, set, and format y-axis Price labels (dollar format)
        yticklab = ax[i,j].get_yticklabels()
        ax[i,j].set_yticklabels(yticklab,fontdict=fontTicks)
        ax[i,j].get_yaxis().set_major_formatter(tickPrice)
        # Set y-grid
        ax[i, j].set_axisbelow(True)
        ax[i, j].grid(axis='y',ls='--')
        ## ---------- Final layout adjustments ----------- ##
        # Delete the unused bottom-row subplots declared above
        fig.delaxes(ax[1,1])
        fig.delaxes(ax[1,0])
        # Optimizing spatial layout
        fig.tight_layout()
        figtitle=column+'_dist_regr_plots.png'
        # Save to the Figures/ directory (fig_filepath is a notebook global)
        plt.savefig(fig_filepath+figtitle)
    return
# Tukey's method using IQR to eliminate
def detect_outliers(df, n, features):
    """Tukey IQR outlier detection.

    Returns the index labels of rows of *df* that fall outside
    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` in more than *n* of the given
    *features* (columns).
    """
    hits = []
    for feature in features:
        # Quartiles and the Tukey fence for this column
        q1 = np.percentile(df[feature], 25)
        q3 = np.percentile(df[feature], 75)
        step = 1.5 * (q3 - q1)
        # Rows outside the fence for this feature
        is_out = (df[feature] < q1 - step) | (df[feature] > q3 + step)
        hits.extend(df[is_out].index)
    # Keep only rows flagged as outliers in more than n features
    counts = Counter(hits)
    return [idx for idx, c in counts.items() if c > n]
# Outliers_to_drop = detect_outliers(data,2,["col1","col2"])
# df.loc[Outliers_to_drop] # Show the outliers rows
# Drop outliers
# data= data.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)
df.head()
# Display DataFrame info
df.info()
# Check for columns with null values (remember strings/objects are not counted here)
res = df.isna().sum()
print(res[res>0],'\nTotal:',len(df))
# Waterfront, view, yr_renovated have missing values
# Recast zipcode as a category
df['zipcode'] = df['zipcode'].astype('category')
# RECASTING SQFT_BASEMENT
# Checking why sqft_basement might be an object:
# df['sqft_basement'].value_counts().nlargest(10)
check_column(df['sqft_basement'],10)
# Removing rows with ? for sqft_basement and converting to float
print(len(df))
df['sqft_basement'].replace(to_replace = '?',value=np.nan,inplace=True) #,inplace=True)
df.dropna(axis=0,subset=['sqft_basement'],inplace=True)
df['sqft_basement'] = df['sqft_basement'].astype('float')
df['sqft_basement'].isna().sum()
check_column(df['sqft_basement'],10)
df.info()
# Check for columns with null values (remember strings/objects are not counted here)
res = df.isna().sum()
print(res[res>0])
# Waterfront, view, yr_renovated have missing values
check_column(df['view'])
# Drop null values from view and re-check column
print(len(df))
df.dropna(axis=0,subset=['view'],inplace=True)
print(len(df))
check_column(df['view'])
# Convert view to category and create a coded version called code_view
df['view'] = df['view'].astype('category')
df['code_view'] = df.view.cat.codes
df['code_view'].value_counts()
# Drop drop_me variable from main df, save in df_dropped
drop_me = 'view'
df_dropped[drop_me] = df[drop_me].copy()
df.drop(drop_me,axis=1,inplace=True)
#Recheck whole df
df.info()
check_column(df['waterfront'])
# Convert waterfront to category, replace null values with "NaN"' string to make it a category
df['waterfront'] = df['waterfront'].astype('category')
df['waterfront'].replace(np.nan,"NaN",inplace=True)
df['waterfront'] = df['waterfront'].astype('category')
# df['waterfront'].value_counts()
check_column(df['waterfront'],10)
# Turn waterfront into coded column
df['code_waterfront'] = df.waterfront.cat.codes
check_column(df['code_waterfront'])
# Drop drop_me variable from main df, save in df_dropped
drop_me = 'waterfront'
df_dropped[drop_me] = df[drop_me].copy()
df.drop(drop_me,axis=1,inplace=True)
df.info()
check_column(df['yr_renovated'],10)
# Treat yr_renovated == 0.0 as "never renovated": recode it as missing.
df['yr_renovated'].replace(0.0, np.nan, inplace=True)
# Recheck for null values
check_column(df['yr_renovated'], 10)
# Binary flag: 1 if the house has a renovation year recorded, else 0.
# BUG FIX: the original assigned `df['yr_renovated'].copy` (the bound method
# object, missing parentheses) to the column and then filled 0/1 through
# chained indexing (`df['is_reno'][idx] = ...`), which triggers
# SettingWithCopyWarning; notna() computes the same flag directly and safely.
df['is_reno'] = df['yr_renovated'].notna().astype(int)
check_column(df['is_reno'], 10)
# Drop drop_me variable from main df, save in df_dropped
drop_me = 'yr_renovated'
df_dropped[drop_me] = df[drop_me].copy()
df.drop(drop_me,axis=1,inplace=True)
df.info()
# Plot histograms and scatterplots vs target variable price for all numerical columns in df (show up in .describe())
fig = plot_hist_scat(df)
# fig = plot_hist_scat_sns(df)
# Create tuples with columns names of categorical variables for easy extraction
cat_vars = ('waterfront','view','zipcode')
ord_vars = ('grade','condition','floors','bedrooms','bathrooms')
vars_to_bin = ('yr_built','yr_renovated','sqft_above','sqft_basement')
num_vars = ('sqft_living', 'sqft_lot','sqft_living15', 'sqft_lot15')
# Check the min and max of the yr variables for binning range
df['yr_built'].describe()
## Bin yr_built then make yr_built category
# yr_built min is 1900, max is 2015
bins = list(range(1900,2030,10))
df['yr_built'].replace(np.nan,0,inplace=True)
bins_yrbuilt = pd.cut(df['yr_built'], bins,include_lowest=True) # Cut into bins
# check_column(bins_yrbuilt)
df['bins_yrbuilt'] = bins_yrbuilt.astype('category').cat.as_ordered() #.cat.as_ordered()
# Inspecting the binned data counts
check_column(df['bins_yrbuilt'])
# Drop original
drop_me = 'yr_built'
df_dropped[drop_me] = df[drop_me].copy()
df.drop(drop_me,axis=1,inplace=True)
df['sqft_basement'].describe()
check_column(df['sqft_basement'],10)
# Define bins so that the 0-1 range is its own interval
bins=[-np.inf]
[bins.append(x) for x in range(1,5500,500)]
# bins
# cut_basement = df['sqft_basement'].replace(0,np.nan)
cut_basement = df['sqft_basement'].replace(np.nan,0).copy()
# cut_basement = cut_basement.replace('NaN',0)
bins_sqftbase = pd.cut(cut_basement, bins=bins, include_lowest=True) # Cut into bins
df['bins_sqftbasement'] = bins_sqftbase.copy()
# Cast as ordered category
df['bins_sqftbasement'] = df['bins_sqftbasement'].astype('category').cat.as_ordered()
# Check result
check_column(df['bins_sqftbasement'])
# Drop original
drop_me = 'sqft_basement'
df_dropped[drop_me] = df[drop_me].copy()
df.drop(drop_me,axis=1,inplace=True)
check_column(df['sqft_above'],10)
df['sqft_above'].describe()
# sqft_above
# Bins to cover range seen above in .describe
bins = list(range(0,9501,500))
# cut_above = df['sqft_above'].replace(0,np.nan)
bins_sqftabove = pd.cut(df['sqft_above'], bins=bins, include_lowest=True) # Cut into bins, including left edge
check_column(bins_sqftabove)
bins_sqftabove.replace(np.nan,'NaN',inplace=True)
df['bins_sqftabove']=bins_sqftabove.astype('category').cat.as_ordered()
check_column(df['bins_sqftabove'])
# Drop original
drop_me = 'sqft_above'
df_dropped[drop_me] = df[drop_me].copy()
df.drop(drop_me,axis=1,inplace=True)
df.info()
# to_drop_copy = ['sqft_above','sqft_basement','yr_built','waterfront','view']
# df_dropped = df[to_drop_copy].copy()
# # df.drop(to_drop_copy,axis=1,inplace=True)
# Plot correlation heatmaps for all data
# pause
to_drop = ['price']
multiplot(df.drop(to_drop,axis=1))
# # Test which multicol variables to keep
# pause
# df[['price','sqft_living','sqft_living15','sqft_above','sqft_basement']].corr()
# # Save copy of data in current form.
# df.to_csv(data_filepath+ 'scrubbed_data.csv')
# df_dropped.to_csv(data_filepath+'dropped_data.csv')
df.info()
df.head()
df.describe()
# Separate out numerical
df.describe()
# INSPECTING NUMERICAL DATA DISTPLOTS + KDE
plt.style.use('seaborn')
plot_hist_scat_sns(df)
# for column in df.describe().columns:
# sns.distplot(df[column],kde=True,label = column+' histogram',norm_hist=True)
# title = column+' histogram & KDE'
# plt.title(title.title())
# plt.ylabel('Density')
# plt.legend()
# plt.show()
# Uncomment below if want new scatter_matrix
# fig = pd.plotting.scatter_matrix(df_clean,figsize=(12,12))
df.info()
# Define list of all categorical variables
list_cat_vars = ['zipcode', 'bins_yrbuilt', 'bins_sqftbasement', 'bins_sqftabove','condition','grade','code_view','code_waterfront']
# Plotting scatterplots and boxplots for categorical data
plt.style.use('seaborn')
for column in list_cat_vars:
fig = plt.figure(figsize=(12,4))
ax1 = fig.add_subplot(121)
ax1 = sns.stripplot(x=df[column],y=df['price'],marker='.')
# ax1.set_title('Price vs ',column,' scatter plot')
title1=column+' scatter'
ax1.set_title(title1.title())
ax1.set_xlabel(column)
ax1.set_ylabel('Price')
ax2 = fig.add_subplot(122)
ax2 = sns.boxplot(x=df[column],y=df['price'])
title2=column+' boxplot'
ax2.set_title(title2.title())
ax2.set_xlabel(column)
ax2.set_ylabel('Price')
fig.tight_layout()
Notes on categorical scatter plots
# # INSPECTING REGRESSION PLOTS
# plt.style.use('seaborn')
# plot_vars=df.describe().columns
# for column in plot_vars:
# # df_plot=df[column]
# # df_plot = df.loc[df[column]>0]
# plot= sns.regplot(df[column], df['price'],robust=False,marker='.') #kde=True,label = column+' histogram')
# # plot = sns.boxplot(df[column],df['price'])
# title = column+' linear regression'
# plt.title(title.title())
# # plt.legend()
# plt.show()
df.describe()
Graphing raw vs normalized results to decide if dataset should be normalized
df.info()
# Define variables to plot vs tukey-cleaned outliers
vars_to_norm = ['sqft_living','sqft_lot','sqft_living15','sqft_lot15','bedrooms','bathrooms']
df.describe().columns
from collections import Counter
from sklearn.preprocessing import RobustScaler
robscaler = RobustScaler()
# vars_to_norm = ['sqft_living','sqft_lot','sqft_living15','sqft_lot15','bedrooms','bathrooms']
norm_results = [['column','K_square','p-val']]
# Graph all potential normalizedvariables
for var_name in df.describe():
var = df[var_name]
fig = plt.figure(figsize=(12,4))
ax1 = fig.add_subplot(121)
ax1 = sns.distplot(var, norm_hist=True)
ax1.set_title('Raw '+var_name)
#robscaler.fit_transform(np.array(var).reshape(-1,1)
# OUTLIER REMOVAL
outliers_to_drop = detect_outliers(df,0,[var_name])
var_clean =df[var_name].drop(outliers_to_drop)
ax2 = fig.add_subplot(122)
ax2 = sns.distplot(var_clean,norm_hist=True)
# ax2 = sns.distplot(robscaler.fit_transform(np.array(var_clean).reshape(-1,1)),norm_hist=True)
ax2.set_title('Tukey Outliers Removed '+var_name) #+var)
ax2.set_xlabel('Scale')
stat, p = normtest(var_clean)
# norm_results.append([var_clean,stat, p])
# norm_results = pd.DataFrame(norm_results[2:],columns=norm_results[0])#,index='columns')
# norm_results
Notes on basic statistics
- Bedrooms has some very clear outliers (max is 33, but 75% quartile is only 4)
- May want to remove outliers after Z-scoring (Absolute Z-score > 3)
- Same with bathrooms (8 is max, 75% quartile is only 2.5)
- Same with sqft_living (max 13540, 75% quartile = 2550)
- Also same with sqft_lot15, sqftliving15
# DEFINE VARIABLES TO GET THE OUTLIERS FOR (based on observations)
# vars_to_norm = ['price','bedrooms''sqft_living','sqft_lot','sqft_living15','sqft_lot15','bedrooms','bathrooms'
vars_to_clean = ['price','bedrooms','sqft_living','bathrooms','sqft_living15']
# GET OUTLIER INDICES AND REPORT
outliers_to_drop = {}
for col in vars_to_clean:
outliers_to_drop[col] = detect_outliers(df,0,[col])
# outliers_to_drop.keys()
# outliers_to_drop.values()
# Print out # of outliers
for k, v in outliers_to_drop.items():
print(f'col: {k} has {len(v)} outliers. ({round(len(v)/len(df),2)*100}%)' )
# Intialize df_norm with df's values
df_norm=df.copy()
# Iterate through the outliers_to_drop dictionary to replace outliers with np.nan
for k, v in outliers_to_drop.items():
df_norm.loc[v,k] = np.nan # axis=0,inplace=True)
# Display null values
df_norm.isna().sum()
# REMOVING OUTLIERS FROM PRICE
drop_col = 'price'
print(f'for {drop_col}:')
print(f'# of intial rows: {len(df_norm)}')
df_norm.dropna(subset=['price'],inplace=True)
print(f'# after dropping rows: {len(df_norm)}')
print(f'\nOutliers remaining: \n{df_norm.isna().sum()}')
# REMOVING OUTLIERS FROM BEDROOMS
drop_col = 'bedrooms'
print(f'for {drop_col}:')
print(f'# of intial rows: {len(df_norm)}')
df_norm.dropna(subset=[drop_col],inplace=True)
print(f'# after dropping rows: {len(df_norm)}')
print(f'\nOutliers remaining: \n{df_norm.isna().sum()}')
# REMOVING OUTLIERS FROM PRICE
drop_col = 'sqft_living'
print(f'for {drop_col}:')
print(f'# of intial rows: {len(df_norm)}')
df_norm.dropna(subset=[drop_col],inplace=True)
print(f'# after dropping rows: {len(df_norm)}')
print(f'\nOutliers remaining: \n{df_norm.isna().sum()}')
# REMOVING OUTLIERS FROM BATHROOMS
drop_col = 'bathrooms'
print(f'for {drop_col}:')
print(f'# of intial rows: {len(df_norm)}')
df_norm.dropna(subset=[drop_col],inplace=True)
print(f'# after dropping rows: {len(df_norm)}')
print(f'\nOutliers remaining: \n{df_norm.isna().sum()}')
drop_col = 'sqft_living15'
print(f'for {drop_col}:')
print(f'# of intial rows: {len(df_norm)}')
df_norm.dropna(subset=[drop_col],inplace=True)
print(f'# after dropping rows: {len(df_norm)}')
print(f'\nOutliers remaining: \n{df_norm.isna().sum()}')
# ADDING OUTLIER REMOVAL FROM preprocessing.RobuseScaler
from sklearn.preprocessing import RobustScaler
robscaler = RobustScaler()
robscaler
vars_to_scale = ['sqft_living','sqft_lot','sqft_living15','sqft_lot15','bedrooms','bathrooms']
for col in vars_to_scale:
col_data = np.array(np.array(df_norm[col]))
res = robscaler.fit_transform(col_data.reshape(-1,1)) #,df['price'])
df_norm['sca_'+col] = res.flatten()
# IF DROPPING VARS UNCOMMENT BELOW
# df_norm.drop(vars_to_scale,axis=1,inplace=True)
df_norm.info()
df_norm.head()
# # Normalize chosen vars
# # df_norm = pd.DataFrame({})
# df_norm['logz_price'] = log_z(df['price'])
# df_norm['logz_sqft_living'] = log_z(df['sqft_living'])a
# df_norm['logz_sqft_living15'] = log_z(df['sqft_living15'])
# df_norm['logz_sqft_lot'] = log_z(df['sqft_lot'])
# df_norm['logz_sqft_lot15'] = log_z(df['sqft_lot15'])
# df_norm.index = df.index
# Z-SCORE AND REMOVE >3 STD
# vars_dont_norm = ['yr_renovated','yr_built','floors','view','grade','condition']
# vars_to_norm = df.describe().drop(vars_dont_norm,axis=1).columns
# vars_to_rem_out = None
# col_names =vars_to_rem_out
# df_norm = df[vars_to_norm].copy()
# df_norm['id'] = df['id'].copy()
# df_norm.set_index('id')
# df_norm.info()
# for col in col_names:
# if col != 'id':
# df_norm['zClean_'+col] = rem_out_z(df[col])
# # df_norm = df_norm.filter(regex=('log'),axis=1)
# df_norm.info()
# Drop na
# # df_run = pd.DataFrame({)
# temp_df = df.filter(regex=('bin'),axis=1)
# df_run = pd.concat([df_norm,temp_df],axis=1)#,sort=False)
# df_run.describe()
# # LOG TRANSFORM AND Z-SCORE DATA
# col_names = vars_to_norm
# df_norm = df[vars_to_norm].copy()
# # df_norm.info()
# for col in col_names:
# data_to_log = df.loc[df[col]>0]
# df_norm['logZ_'+col] = log_z(data_to_log[col])
# # df_norm['logZ_'+col] = log_z(df[col])
# df_norm = df_norm.filter(regex=('log'),axis=1)
# df_norm.info()
plt.style.use('fivethirtyeight')
plot_hist_scat_sns(df_norm)
df_norm.info()
multiplot(df_norm.filter(regex='sca',axis=1))
plt.title('Scaled Data only')
X =df_norm.loc[:,~(df_norm.columns.str.startswith('sca'))]
multiplot(X.drop('price',axis=1))
plt.title('Un-scaled Data')
X=[]
# DEFINING DATASET TO RUN (df_run)
df_run = df_norm.copy()
# df_run.to_csv(data_filepath+'df_run_pre_codes.csv')
df_run.info()
df_filt = pd.DataFrame({})
df_filt = df_run.filter(regex=('bins_'),axis =1).copy()
df_filt['zipcode']=df_run['zipcode'].copy()
df_filt.info()
# Creating binned vars cat codes
for col in df_filt:
df_filt['code'+col] = df_filt[col].cat.codes
df_filt.drop(col,axis=1,inplace=True)
df_filt.info()
df_filt.head()
## Select columns that do not contain the string 'logZ'
# df_run =df_run.loc[:,~(df_run.columns.str.startswith('logZ'))]
# # df_filt['price'] = df_run['price'].copy()
# df_run =df_run.loc[:,~(df_run.columns.str.startswith('bin'))]
# df_run.info()
# # DECIDE NOW IF WANT CLEAN OR NOT CLEANED DATA
# to_drop_unclean = ['bedrooms','bathrooms','sqft_living','sqft_lot','sqft_living15','sqft_lot15','zClean_price']
# to_drop_clean = list(df_run.loc[:,~(df_run.columns.str.startswith('zClean'))].columns)
# to_drop_unclean, to_drop_clean
# DROP UNCLEAN
# df_run.drop(['id','bedrooms','bathrooms','sqft_living','sqft_lot','sqft_living15','sqft_lot15','zClean_price'],axis=1,inplace=True)
# df_run.drop(to_drop_unclean,axis=1,inplace=True)
# Concatenate codebins from df_filt + df_run
df_run = pd.concat([df_run, df_filt],axis=1) #).filter(regex=('code'))],axis=1)
df_run.info()
df_run.head(2)
# Select columns that do not contain the string 'logZ'
df_run =df_run.loc[:,~(df_run.columns.str.startswith('logZ'))]
# bookmark 202am
# df_run.to_csv(data_filepath+'df_run_coded.csv')
# df_run = pd.read_csv('df_run_coded.csv')
# df_run.info()
df_run.info()
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
import sklearn.metrics as metrics
df_run.info()
# Define selector function combining RFE and linear regression
linreg = LinearRegression()
selector = RFE(linreg, n_features_to_select=1)
# Drop already scaled variables for this feature testing
X =df_run.loc[:,~(df_run.columns.str.startswith(('bins','zip')))]
X = X.drop('price',axis=1)
# RUNNING RFE ON THE UNSCALED DATA(DEMONSTRATION)
Y = df_run['price']
# Y = df_run['logz_price']
# X = df_run.drop(['price'],axis=1)
# Checking X
X.info()
# Run regressions on X,Y
selector = selector.fit(X,Y)
# Saving unscaled rankings for demo purposes
no_scale = selector.ranking_
# Scale all variables to value between 0-1 to use RFE to determine which features are the most important for determining price?
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Scale the data before running RFE
print('Consider revisiting this step and dummy-coding zipcode.')
# ONLY SCALE NON-CATEGORICAL, ONE-HOT CATEGORICAL
scaler.fit(X,Y)
scaled_data = scaler.transform(X)
scaled_data.shape
# Running RFE with scaled data
selector = selector.fit(scaled_data, Y)
scaled = selector.ranking_
# Create a dataframe with the ranked values of each feature for both scaled and unscaled data
best_features = pd.DataFrame({'columns':X.columns, 'scaled_rank' : scaled,'unscaled_rank':no_scale})
best_features.set_index('columns',inplace=True)
# Display dataframe (sorted based on unscaled rank)
best_features.sort_values('unscaled_rank')
# Plot the difference in feature importance between analyzing scaled and unscaled data.
# For demonstration purposes.
features = pd.DataFrame({'Columns':X.columns, 'Not_Scaled':no_scale, 'Scaled':scaled})
# PLot the difference between
plt.style.use('seaborn')
# plt.xkcd()
ax = features.set_index('Columns').plot(kind = 'bar',figsize=(12,8))
ax.set_title('Feature Importance Scaled vs. Not Scaled')
ax.set_ylabel('Features Importance')
X.info()
r_squared = []
for x in range(1, len(X.columns)):
selector = RFE(linreg, n_features_to_select=x)
selector.fit(scaled_data, Y)
linreg.fit(X[X.columns[selector.support_]], Y)
r_sq = linreg.score(X[X.columns[selector.support_]], Y)
r_squared.append(r_sq)
# r_squared
from sklearn.metrics import mean_squared_error
mse=[]
for x in range(1, len(X.columns)):
selector = RFE(linreg, n_features_to_select=x)
selector.fit(scaled_data, Y)
linreg.fit(X[X.columns[selector.support_]], Y)
y_pred = linreg.predict(X[X.columns[selector.support_]])
mse.append(mean_squared_error(Y, y_pred))
# mse
from sklearn.metrics import mean_squared_error
fig = plt.figure(figsize=(12 ,6))
ax1 = plt.subplot(121)
ax1.plot(range(1, len(X.columns)), r_squared)
ax1.set_ylabel('R_Squared')
ax1.set_xlabel('Number of Features')
ax1.grid()
ax2 = plt.subplot(122)
ax2.plot(range(1,len(mse)+1), mse )
ax2.set_ylabel('MSE')
ax2.set_xlabel('Number of Features',fontsize=20)
ax2.grid()
r_squared = []
for x in range(1, len(X.columns)):
selector = RFE(linreg, n_features_to_select=x)
selector.fit(X, Y)
linreg.fit(X[X.columns[selector.support_]], Y)
r_sq = linreg.score(X[X.columns[selector.support_]], Y)
r_squared.append(r_sq)
from sklearn.metrics import mean_squared_error
mse=[]
for x in range(1, len(X.columns)):
selector = RFE(linreg, n_features_to_select=x)
selector.fit(X, Y)
linreg.fit(X[X.columns[selector.support_]], Y)
y_pred = linreg.predict(X[X.columns[selector.support_]])
mse.append(mean_squared_error(Y, y_pred))
fig = plt.figure(figsize=(12 ,6))
ax1 = plt.subplot(121)
ax1.plot(range(1, len(X.columns)), r_squared)
ax1.set_ylabel('R_Squared')
ax1.set_xlabel('Number of Features')
ax1.grid()
ax2 = plt.subplot(122)
ax2.plot(range(1,len(mse)+1), mse )
ax2.set_ylabel('MSE')
ax2.set_xlabel('Number of Features')
ax2.grid()
best_features.sort_values('unscaled_rank')
# Pull out the 6 best features via ranking
best_num_feat = 6
selected_features = best_features.sort_values('unscaled_rank')[0:best_num_feat]
selected_features
# USING UNSCALED
selected_features.index
# Check the columns of X
X[selected_features.index].head()
df_run.info()
# Running simple linear regression for each predictor on its own
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats
import statsmodels.stats.api as sms
# log_price = np.log(df['price'])
# df['log_price'] = log_price
target_var = 'price'
col_names = df_run.drop(['price'],axis=1).columns
# Create results list for saving the output statstics for each predictor
results = [['ind_var', 'r_squared', 'intercept', 'slope', 'p-value' ]]
for idx, val in enumerate(col_names):
# Use the names of the columns to determine format of forumla
if val.startswith('code'):
df_run[val] = df_run[val].astype('category').cat.as_ordered()
f =f'{str(target_var)}~C({val})'
elif val.startswith('bin'):
df_run[val] = df_run[val].cat.as_ordered()
f =f'{str(target_var)}~C({val})'
else:
f =f'{str(target_var)}~{val}'
# Run the ols models
model = smf.ols(formula=f, data=df_run).fit()
model.summary()
# Append results
results.append([val, model.rsquared, model.params[0], model.params[1], model.pvalues[1] ])
# Turn results into dataframe with correct index and columns
res_df = pd.DataFrame(results)
res_df.columns = res_df.iloc[0]
res_df=res_df[1:]
res_df.set_index('ind_var',inplace=True)
res_df.sort_values('r_squared',ascending=False)
# Initial variables for modeling -- the scaled ('sca_') versions are the
# correct, non-redundant forms of the sqft variables.
# BUG FIX: the original cell re-assigned the list with a typo
# ('sca_sqft_living16', a column that does not exist) which would raise a
# KeyError when indexing df_run below, and duplicated the assignment
# (`try_modeling = try_modeling = [...]`).
try_modeling = ['codezipcode', 'grade', 'sca_sqft_living', 'sca_sqft_living15']
import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats
import statsmodels.stats.api as sms
from sklearn.preprocessing import MinMaxScaler
# # PUTTING TOGETHER THE PREDICTORS TO RUN IN THE REGRESSION
# ## Last min dummy vars []'cat_grade','cat_zipcode','cat_view','cat_bins_sqft_above','cat_bins_sqft_basement']
# dum_grades = pd.get_dummies(df_run['cat_grade'],prefix='gr').iloc[:,:-1]
# dum_view = pd.get_dummies(df_run['cat_view'], prefix='view').iloc[:,:-1]
# dum_sqft_above = pd.get_dummies(df_run['cat_bins_sqftabove'],prefix='sqftAb').iloc[:,:-1]
# dum_sqft_base = pd.get_dummies(df_run['cat_bins_sqftbasement'],prefix='sqftBa').iloc[:,:-1]
# RUNNING K-FOLD VALIDATION WITH STATSMODELS OLS.
# X = df_run.drop(['price','logZ_price'],axis=1)
# list_predictors = ['logZ_sqft_living','logZ_sqft_living15','bedrooms','bathrooms','floors']
# scaler = MinMaxScaler()
# sel_columns = selected_features.index
# Define X, Y
X = df_run[try_modeling]
# X.columns
Y = df_run['price']
# y = df_run['logZ_price']
# Get a list of predictor names string
list_predictors = [str(x) for x in X.columns]
list_predictors.append('intercept')
list_predictors
# Concatenate X, Y for OLS
df_run_ols = pd.concat([Y,X],axis=1)
# Import packages
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.formula.api as smf
import scipy.stats as stats
# Enter equation for selected predictors: (use C to run as categorical)
# f1 = 'price~C(codezipcode)+C(grade)+sca_sqft_living+sca_sqft_living15' # 0.8 r1 Adjusted
f1 = 'price~C(codezipcode)+grade+sca_sqft_living+sca_sqft_living15'
# Run model and report sumamry
model = smf.ols(formula=f1, data=df_run_ols).fit()
model.summary()
- f1 = 'price ~ C(codezipcode) + grade + sca_sqft_living + sca_sqft_living15'
# df_run_ols.to_csv(data_filepath+'df_run_ols_model.csv')
# Visualize Q-Q Plots
resid1=model.resid
fig = sm.graphics.qqplot(resid1, dist=stats.norm, line='45', fit=True,marker='.')
# Visualizing final dataset again.
import matplotlib.pyplot as plt
# Re-inspecting XY
# plot_hist_scat(df_run_ols)
# k_fold_val_ols(X,y,k=10):
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
y = df_run['price']
# Run 10-fold cross validation
results = [['set#','R_square_train','MSE_train','R_square_test','MSE_test']]
num_coeff = X.shape[1]
list_predictors = [str(x) for x in X.columns]
list_predictors.append('intercept')
reg_params = [list_predictors]
i=0
k=10
while i <(k+1):
X_train, X_test, y_train, y_test = train_test_split(X,y) #,stratify=[cat_col_names])
data = pd.concat([X_train,y_train],axis=1)
f1 = 'price~C(codezipcode)+grade+sca_sqft_living+sca_sqft_living15'
model = smf.ols(formula=f1, data=data).fit()
model.summary()
y_hat_train = model.predict(X_train)
y_hat_test = model.predict(X_test)
train_residuals = y_hat_train - y_train
test_residuals = y_hat_test - y_test
train_mse = metrics.mean_squared_error(y_train, y_hat_train)
test_mse = metrics.mean_squared_error(y_test, y_hat_test)
R_sqare_train = metrics.r2_score(y_train,y_hat_train)
R_square_test = metrics.r2_score(y_test,y_hat_test)
results.append([i,R_sqare_train,train_mse,R_square_test,test_mse])
i+=1
results = pd.DataFrame(results[1:],columns=results[0])
results
model.summary()
results
resid1=model.resid
fig = sm.graphics.qqplot(resid1, dist=stats.norm, line='45', fit=True,marker='.')
df_model = pd.concat([df_run[try_modeling],df_run['price']],axis=1)
# df_model.to_csv(data_filepath+'df_housing_model.csv')
# df_run.to_csv(data_filepath+'df_house_all_final_data.csv')
results.describe()
For k=10 fold validation, with price as target variable:
f1 = 'price~C(codezipcode)+C(grade)+sca_sqft_living+sca_sqft_living15'
Predictors in final model:
My final model indicates that size, location, and housing grade are critical components in determining sales price.
# Sanity checks: index of the set-aside columns and a summary of the run data.
df_dropped.index
df_run.describe()
import pandas as pd
# Re-attach the geographic columns (and id) that were dropped before modeling.
geo_cols = ['lat', 'long', 'id']
df_final_data = pd.concat([df_run, df_dropped[geo_cols]], axis=1)
df_final_data.info()
# Persist the final combined dataset.
df_final_data.to_csv(data_filepath + 'kc_housing_model_df_final_data.csv')
# Reset the visual style of the notebook
import matplotlib as mpl
import matplotlib.pyplot as plt
# Restore the rcParams snapshot taken at notebook start (inline_rc), undoing
# the earlier 'seaborn' style.
mpl.rcParams.update(inline_rc)
# inline_rc = dict(mpl.rcParams)
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
# Dark theme for the summary figures that follow.
plt.style.use('dark_background')
# plt.style.use('dark')
## Figure scaffolding for the 2x2 summary plot (panels share the y-axis).
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(16, 12), sharey=True)
## Shared formatting helpers
# Dollar formatter for price axes, e.g. 650000 -> $650,000.
fmtPrice = '${x:,.0f}'
tickPrice = mtick.StrMethodFormatter(fmtPrice)
# Font settings for subplot titles, axis labels, and tick labels.
fontTitle = dict(fontsize=20, fontweight='bold', fontfamily='serif')
fontAxis = dict(fontsize=16, fontweight='bold', fontfamily='serif')
fontTicks = dict(fontsize=12, fontweight='medium', fontfamily='serif')
# Vertical offset of each subplot title above its axes.
y_title_margin = 1.01
# Major title
# plt.suptitle("Critical Factors for Predicting Sales Price", y = 1.0, fontdict=fontTitle, fontsize=22)
## Subplot 1
# Top-left panel: price distribution per zipcode as a strip plot. The x tick
# labels are removed — there are too many zipcodes to label legibly.
i,j=0,0
ax[i,j].set_title("Zipcode",y = y_title_margin,fontdict=fontTitle)#, y = y_title_margin)
sns.stripplot(df_final_data['zipcode'],df_final_data['price'],ax=ax[i,j],marker='o',size=3)
# Remove xticks
ax[i,j].set_xticks([]), ax[i,j].set_xlabel('')
# Change y-tick labels
ax[i,j].set_ylabel('Price',fontdict=fontAxis)
yticklab = ax[i,j].get_yticklabels()
ax[i,j].set_yticklabels(yticklab,fontdict=fontTicks)
# Render y ticks as dollar amounts.
ax[i,j].get_yaxis().set_major_formatter(tickPrice)
# Set y-grid
ax[i, j].set_axisbelow(True)
ax[i, j].grid(axis='y',ls='--')
## Subplot 2
# Top-right panel: price distribution per housing grade. Shares the y-axis
# with subplot 1, so the y label is blanked but ticks/grid are still styled.
i,j = 0,1
ax[i,j].set_title("Housing Grade",y = y_title_margin,fontdict=fontTitle)
ax[i,j].title.set_fontsize(20)
sns.stripplot(df_final_data['grade'],df_final_data['price'],ax=ax[i,j],marker='o',size=2)
#Set x axis
xticklab=ax[i,j].get_xticklabels()
ax[i,j].set_xticklabels(xticklab,fontdict=fontTicks)
ax[i,j].set_xlabel('Grade')
# Change y-tick labels
ax[i,j].set_ylabel('')# 'Price',fontdict=fontAxis)
# Set y-grid
ax[i, j].set_axisbelow(True)
ax[i, j].grid(axis='y',ls='--')
yticklab = ax[i,j].get_yticklabels()
ax[i,j].set_yticklabels(yticklab,fontdict=fontTicks)
# Dollar-format the shared y-axis.
ax[i,j].get_yaxis().set_major_formatter(tickPrice)
## Subplot 3 — bottom-left: price vs. living area with a linear trend line.
i, j = 1, 0
# Title
ax[i, j].set_title("Living Space (sqft)", y=y_title_margin, fontdict=fontTitle)
ax[i, j].title.set_fontsize(20)
# Aesthetics for the fitted line and scatter points.
trend_style = {'color': 'orange', 'alpha': 0.5, 'lw': 8, 'ls': ':'}
point_style = {'s': 5, 'alpha': 0.5, 'marker': '.', 'color': 'red'}
# Scatter plus regression line in one call.
sns.regplot(df_final_data['sqft_living'], df_final_data['price'], ax=ax[i, j],
            scatter_kws=point_style, line_kws=trend_style)
## x-axis: label, rotated tick labels, 500-sqft major tick spacing.
ax[i, j].set_xlabel('Area (sqft)', fontdict=fontAxis)
ax[i, j].set_xticklabels(ax[i, j].get_xticklabels(), fontdict=fontTicks, rotation=45)
ax[i, j].xaxis.set_major_locator(mtick.MultipleLocator(500))
ax[i, j].xaxis.set_major_formatter(mtick.ScalarFormatter())
## y-axis: dollar-formatted price labels.
ax[i, j].set_ylabel('Price', fontdict=fontAxis)
ax[i, j].set_yticklabels(ax[i, j].get_yticklabels(), fontdict=fontTicks)
ax[i, j].get_yaxis().set_major_formatter(tickPrice)
# Dashed horizontal grid drawn behind the data.
ax[i, j].set_axisbelow(True)
ax[i, j].grid(axis='y', ls='--')
# ## Subplot 4
# Bottom-right panel: price vs. the 15 nearest neighbors' living area, with a
# linear trend line. Shares the y-axis, so the y label is blanked.
i,j = 1,1
ax[i,j].set_title("Neighbor's Living Space (sqft)",y = y_title_margin,fontdict=fontTitle)
ax[i,j].title.set_fontsize(20)
# Define the scatter plot and line graph aesthetics
line_kws={"color":"lime","alpha":0.5,"lw":8,"ls":":"}
scatter_kws={'s': 5, 'alpha': 0.5,'marker':'.','color':'blueviolet'}
# Plot seaborn plot
sns.regplot(df_final_data['sqft_living15'], df_final_data['price'],ax=ax[i,j], scatter_kws=scatter_kws, line_kws=line_kws)
# Change the x-axis labels
ax[i,j].set_xlabel('Area (sqft)',fontdict=fontAxis)
# Get ticks, rotate labels, and return
xticklab=ax[i,j].get_xticklabels()
ax[i,j].set_xticklabels(xticklab,fontdict=fontTicks, rotation=45)
# Change the major units of x-axis (500-sqft steps, plain numbers)
ax[i,j].xaxis.set_major_locator(mtick.MultipleLocator(500))
ax[i,j].xaxis.set_major_formatter(mtick.ScalarFormatter())
# Change y-tick labels
ax[i,j].set_ylabel('')#Price',fontdict=fontAxis)
yticklab = ax[i,j].get_yticklabels()
ax[i,j].set_yticklabels(yticklab,fontdict=fontTicks)
# Dollar-format the shared y-axis.
ax[i,j].get_yaxis().set_major_formatter(tickPrice)
# Set y-grid
ax[i, j].set_axisbelow(True)
ax[i, j].grid(axis='y',ls='--')
# Finalize spacing and save the composed 2x2 summary figure.
plt.tight_layout()
plt.savefig(fig_filepath+"summary_figure.png") # save as png
from IPython.display import Image
# Display pre-rendered map figures.
# NOTE(review): these load from "Final Figures/" while fig_filepath points at
# a "/Figures/" directory — confirm both paths exist in the project.
Image("Final Figures/map_median_price.png")
Image("Final Figures/map_latlong_price.png")
# plt.savefig(fig_filepath+"summary_figure.png") # save as png
# Second summary figure — same 2x2 layout, but plotted from the OLS-ready
# dataframe (df_run_ols) rather than df_final_data. Continues past this cell.
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
plt.style.use('dark_background')
# figSum, ax = plt.subplots(nrows=2,ncols=2,figsize=(16,16),sharey=True)
fig, ax = plt.subplots(figsize=(16,12), ncols=2, nrows=2, sharey=True)
# Formatting dollar sign labels
fmt = '${x:,.0f}'
tick = mtick.StrMethodFormatter(fmt)
# Axis Label fonts
fontTitle = {'fontsize': 20,
'fontweight': 'bold',
'fontfamily':'serif'}
fontAxis = {'fontsize': 16,
'fontweight': 'bold',
'fontfamily':'serif'}
fontTicks = {'fontsize': 12,
'fontweight':'medium',
'fontfamily':'serif'}
# The amount of space above titles
y_title_margin = 1.01
# Major title
# plt.suptitle("Critical Factors for Predicting Sales Price", y = 1.0, fontdict=fontTitle, fontsize=22)
## Subplot 1
# Top-left: price per (encoded) zipcode, x ticks removed.
i,j=0,0
ax[i,j].set_title( "Zipcode", y = y_title_margin, fontdict = fontTitle)
sns.stripplot(df_run_ols['codezipcode'],df_run_ols['price'],ax=ax[i,j],marker='o',size=3,jitter=True)
# Remove xticks
# NOTE(review): this section mixes ax[i,j] and ax[i][j] — equivalent for a 2-D
# ndarray of Axes, but inconsistent with the first figure's ax[i,j] style.
ax[i,j].set_xticks([]), ax[i][j].set_xlabel('')
# Change y-tick labels
ax[i][j].set_ylabel('Price',fontdict=fontAxis)
yticklab = ax[i][j].get_yticklabels()
ax[i][j].set_yticklabels(yticklab,fontdict=fontTicks)
# Dollar-format the y-axis.
ax[i,j].get_yaxis().set_major_formatter(tick)